Finding Common Origins of Milky Way Stars

Author

Andersen Chang, Tiffany M. Tang, Tarek M. Zikry, Genevera I. Allen

Published

May 30, 2025

Dimension Reduction

Show Code to Fit Dimension Reduction Methods
## this code chunk fits the dimension reduction methods

# select dimension reduction hyperparameter grids
TSNE_PERPLEXITIES <- c(10, 30, 60, 100, 300)
UMAP_N_NEIGHBORS <- c(10, 30, 60, 100, 300)

# select dimension reduction methods
dr_fun_ls <- c(
  list("PCA" = fit_pca),
  purrr::map(
    TSNE_PERPLEXITIES,
    ~ purrr::partial(fit_tsne, dims = 2, perplexity = .x)
  ) |> 
    setNames(sprintf("tSNE (perplexity = %d)", TSNE_PERPLEXITIES)),
  purrr::map(
    UMAP_N_NEIGHBORS,
    ~ purrr::partial(fit_umap, dims = 2, n_neighbors = .x
    )
  ) |> 
    setNames(sprintf("UMAP (n_neighbors = %d)", UMAP_N_NEIGHBORS))
)

fit_results_fname <- file.path(RESULTS_PATH, "dimension_reduction_fits.rds")
if (!file.exists(fit_results_fname)) {
  # fit dimension reduction methods (if not already cached)
  dr_fit_ls <- purrr::map(
    train_data_ls,
    function(train_data) {
      purrr::map(dr_fun_ls, function(dr_fun) dr_fun(train_data))
    }
  )
  # save dimension reduction fits
  saveRDS(dr_fit_ls, file = fit_results_fname)
} else {
  # read in dimension reduction fits (if already cached)
  dr_fit_ls <- readRDS(fit_results_fname)
}

# aggregate all dimension reduction results into one df
plt_df <- purrr::list_flatten(dr_fit_ls, name_spec = "{inner} [{outer}]") |> 
  purrr::map(
    ~ .x$scores[, 1:2] |> 
      setNames(sprintf("Component %d", 1:2)) |> 
      dplyr::bind_cols(
        metadata$train |> dplyr::select(GC_NAME, GLAT, GLON)
      ) |> 
      dplyr::mutate(
        id = 1:dplyr::n()
      )
  ) |> 
  dr_results_to_df()
Show Code to Tune/Evaluate Dimension Reduction Methods
## this code chunk evaluates the dimension reduction methods

# evaluate neighborhood retention metric
Ks <- c(1, 5, 10, 25, 50, 100, 200, 300)

eval_results_fname <- file.path(RESULTS_PATH, "dimension_reduction_eval.rds")
if (!file.exists(eval_results_fname)) {
  # evaluate neighborhood retention (if not already cached)
  dr_eval_ls <- purrr::imap(
    dr_fit_ls,
    function(dr_out, key) {
      purrr::map(
        dr_out, 
        function(.x) {
          eval_neighborhood_retention(
            orig_data = train_data_ls[[key]],
            dr_data = .x$scores[, 1:min(ncol(.x$scores), 4)],
            ks = Ks
          )
        }
      )
    }
  )
  # save dimension reduction evaluation results
  saveRDS(dr_eval_ls, file = eval_results_fname)
} else {
  # read in dimension reduction evaluation results (if already cached)
  dr_eval_ls <- readRDS(eval_results_fname)
}

eval_plt_df <- purrr::list_flatten(
  dr_eval_ls, name_spec = "{inner} [{outer}]"
) |> 
  dr_results_to_df()

Hyperparameter Tuning via Neighborhood Retention

UMAP always below tSNE (100)

Dimension Reduction Plots

Dimension reduction visualizations, colored by GC.

Galactic coordinates plot (jittered), colored by value of the first component from dimension reduction method.

Galactic coordinates plot (jittered), colored by value of the first component from dimension reduction method.

Galactic coordinates plot (jittered), colored by value of the second component from dimension reduction method.

Galactic coordinates plot (jittered), colored by value of the second component from dimension reduction method.

Principal Component Loadings

Principal Component Loadings